#!/usr/bin/env python
# -*- coding:utf-8 -*- 
# @Time    : 2018/12/5 17:49
# @Author  : liujiantao
# @Site    : https://zhuanlan.zhihu.com/p/32265553
# @File    : chimerge_woe_iv.py
# @Software: PyCharm
import pandas as pd
import numpy as np
import math


def _chi_square_pair(first, second):
    """Return the 2x2 chi-square statistic of two adjacent bins.

    Each argument is a row ``[upper_bound, positive_count, negative_count]``.
    Counts are converted to float first so that large integer counts cannot
    overflow a fixed-width integer array. A degenerate table (any marginal
    total equal to zero) yields 0.0 instead of dividing by zero.
    """
    pos_a, neg_a = float(first[1]), float(first[2])
    pos_b, neg_b = float(second[1]), float(second[2])
    denominator = ((pos_a + neg_a) * (pos_b + neg_b)
                   * (pos_a + pos_b) * (neg_a + neg_b))
    if denominator == 0:
        return 0.0
    total = pos_a + neg_a + pos_b + neg_b
    return (pos_a * neg_b - neg_a * pos_b) ** 2 * total / denominator


def _merge_with_next(rows, idx):
    """Fold row ``idx + 1`` into row ``idx`` and return the shortened array."""
    rows[idx, 1] = rows[idx, 1] + rows[idx + 1, 1]  # positive samples
    rows[idx, 2] = rows[idx, 2] + rows[idx + 1, 2]  # negative samples
    rows[idx, 0] = rows[idx + 1, 0]  # merged bin takes the larger upper bound
    return np.delete(rows, idx + 1, 0)


def ChiMerge(df, variable, flag, confidenceVal=3.841, bin=10, sample=None):
    """Bin ``variable`` with the ChiMerge (chi-square merging) algorithm.

    Every distinct value of ``variable`` starts as its own bin; the adjacent
    pair with the smallest chi-square statistic is merged repeatedly. Merging
    stops once there are at most ``bin`` bins AND every adjacent pair has a
    chi-square value of at least ``confidenceVal`` (or when a single bin is
    left).

    Example::

        data = pd.read_csv('E:/breast_cancer.csv', sep=',')
        temp = data[['radius_mean', 'diagnosis']]
        cuts = ChiMerge(temp, 'radius_mean', 'diagnosis',
                        confidenceVal=5.841, bin=5, sample=None)

    :param df: data frame holding the variable to bin and the class flag
        (positive samples are 1, negative samples are 0).
    :param variable: name of the column to bin.
    :param flag: name of the 0/1 class column.
    :param confidenceVal: chi-square threshold; the default 3.841 is the 95%
        critical value for one degree of freedom.
    :param bin: maximum number of bins (the name shadows the builtin but is
        kept because callers may pass it by keyword).
    :param sample: if not None, number of rows to sample randomly before
        binning; useful because many observations make the merge slow.
    :return: list with one entry per final bin holding the bin's upper bound
        (largest original value merged into it). As in the previous
        implementation, when there is more than one bin the last entry
        repeats the upper bound of the second-to-last bin.
    """
    # Optional down-sampling.
    if sample is not None:
        df = df.sample(n=sample)

    # One row per distinct value: [value, positive count, negative count].
    grouped = df.groupby(variable)[flag]
    regroup = pd.DataFrame({'positive_class': grouped.sum()})
    regroup['negative_class'] = grouped.count() - regroup['positive_class']
    regroup.reset_index(inplace=True)
    np_regroup = np.array(regroup)  # plain array for cheaper row updates
    print('已完成数据读入,正在计算数据初处理')

    # Merge neighbours that both lack positives or both lack negatives, so
    # the chi-square denominator below cannot be zero.
    i = 0
    while i <= np_regroup.shape[0] - 2:
        both_without_pos = np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0
        both_without_neg = np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0
        if both_without_pos or both_without_neg:
            # Stay on the same row: it must be re-checked against its new
            # right-hand neighbour.
            np_regroup = _merge_with_next(np_regroup, i)
        else:
            i += 1

    # chi_table[k] is the statistic for bins k and k + 1.
    chi_table = np.array(
        [_chi_square_pair(np_regroup[k], np_regroup[k + 1])
         for k in range(np_regroup.shape[0] - 1)],
        dtype=float)
    print('已完成数据初处理，正在进行卡方分箱核心操作')

    # Core loop: merge the adjacent pair with the smallest chi-square value.
    # An empty table means a single bin is left and nothing more can merge.
    while chi_table.size > 0:
        if len(chi_table) <= (bin - 1) and chi_table.min() >= confidenceVal:
            break
        idx = int(np.argmin(chi_table))  # first occurrence of the minimum
        np_regroup = _merge_with_next(np_regroup, idx)
        chi_table = np.delete(chi_table, idx)
        # Refresh only the statistics that involve the merged bin. The
        # explicit bounds checks keep index 0 from wrapping around to the
        # last entry of the table.
        if idx > 0:
            chi_table[idx - 1] = _chi_square_pair(np_regroup[idx - 1],
                                                  np_regroup[idx])
        if idx < np_regroup.shape[0] - 1:
            chi_table[idx] = _chi_square_pair(np_regroup[idx],
                                              np_regroup[idx + 1])
    print('已完成卡方分箱核心操作，正在保存结果')

    # Collect the upper bound of every bin; the last entry mirrors the
    # previous bound (unchanged return format).
    list_temp = list(np_regroup[:, 0])
    if len(list_temp) > 1:
        list_temp[-1] = list_temp[-2]
    return list_temp


# Example: df = woe_iv_value('E:/breast_cancer.csv', 'radius_mean', (10, 15, 20), 'diagnosis')
def woe_iv_value(file, feature, sep, target):
    """Compute per-bin WOE and IV of ``feature`` against a binary ``target``.

    The CSV is read with ``pd.read_csv(file, sep=',')``. Rows are split into
    the bins ``(-inf, c0]``, ``(c0, c1]``, ..., ``(c_last, +inf)`` plus one
    bin for missing values. For every bin::

        WOE = ln((good_bin / good_total) / (bad_bin / bad_total))
        IV  = (good_bin / good_total - bad_bin / bad_total) * WOE

    where "good" counts rows with ``target == 1`` and "bad" counts rows with
    ``target == 0``. A bin with no good or no bad rows has an undefined WOE
    (reported as NaN) and contributes 0 to the IV.

    :param file: path or file-like object of the comma-separated input.
    :param feature: name of the numeric column to analyse.
    :param sep: cut points, e.g. ``(10, 15, 20)``; a single number or a
        comma-separated string such as ``"10,15,20"`` also works.
    :param target: name of the 0/1 label column.
    :return: DataFrame with columns ``class, good, bad, %good, %bad, all,
        woe, iv``: one row per bin (index 0..k) followed by a summary row
        with index ``" "`` holding the totals and the summed IV.
    :raises ValueError: if ``sep`` contains no cut point or a token that
        cannot be converted to float.
    """
    data = pd.read_csv(file, sep=',')

    # Cut points are parsed from the textual form of ``sep``; the raw tokens
    # are kept because the interval labels are built from them verbatim.
    raw = str(sep)
    for bracket in '()[]':
        raw = raw.replace(bracket, '')
    # Dropping blank tokens makes one-element tuples such as (10,) work.
    tokens = [tok for tok in raw.split(',') if tok.strip()]
    if not tokens:
        raise ValueError("sep must contain at least one cut point")
    cuts = [float(tok) for tok in tokens]

    # Boolean row masks, one per bin; intervals are open on the left and
    # closed on the right so they agree with the "(a,b]" labels.
    values = data[feature]
    masks = [values <= cuts[0]]
    class_labels = ["(0," + str(cuts[0]) + "]"]
    for pos in range(1, len(cuts)):
        masks.append((values > cuts[pos - 1]) & (values <= cuts[pos]))
        class_labels.append("(" + str(tokens[pos - 1]) + "," + str(tokens[pos]) + "]")
    masks.append(values > cuts[-1])
    class_labels.append("(" + str(tokens[-1]) + "...)")
    masks.append(values.isnull())
    class_labels.append("NA")

    is_bad = data[target] == 0
    is_good = data[target] == 1
    all_counts = [int(mask.sum()) for mask in masks]
    bad_counts = [int((mask & is_bad).sum()) for mask in masks]
    good_counts = [int((mask & is_good).sum()) for mask in masks]
    total_bad = int(is_bad.sum())
    total_good = int(is_good.sum())

    woe_values = []
    iv_values = []
    for good, bad in zip(good_counts, bad_counts):
        if good == 0 or bad == 0:
            # ln(0) or division by zero: WOE is undefined for this bin.
            woe_values.append(float('nan'))
            iv_values.append(0.0)
            continue
        good_rate = good / float(total_good)
        bad_rate = bad / float(total_bad)
        woe = math.log(good_rate / bad_rate)  # natural logarithm
        woe_values.append(woe)
        iv_values.append((good_rate - bad_rate) * woe)

    # Bin rows followed by the summary row (index " ").
    row_index = list(range(len(masks))) + [" "]
    df = pd.DataFrame({
        "bad": bad_counts + [total_bad],
        "good": good_counts + [total_good],
        "all": all_counts + [len(data)],
        "woe": woe_values + [""],
        "iv": iv_values + [sum(iv_values)],
        "class": class_labels + [""],
    }, index=row_index)

    df["%good"] = (df['good'] / df['all']).map('{:.2%}'.format)
    df["%bad"] = (df['bad'] / df['all']).map('{:.2%}'.format)

    # Fix the column order (``.ix`` no longer exists in current pandas).
    df = df.loc[:, ['class', 'good', 'bad', '%good', '%bad', 'all', 'woe', 'iv']]
    return df


# Top-level statement: executed whenever this module is run or imported,
# after the definitions above have been created.
print("finish!!")
